# Load the CDC heart-disease survey and coerce the categorical columns to
# factors for plotting and modelling.
heartdata <- data.frame(read.csv("heart_2020_cleaned.csv"))

n <- 10  # subsampling stride (only used by the commented line below)
#heartdata <- heartdata[seq(1,nrow(heartdata),n),]

# Plain Yes/No (or Male/Female) columns: default factor conversion.
# (Replaces ten copy-pasted as.factor() lines.)
yesno_cols <- c("HeartDisease", "Smoking", "AlcoholDrinking", "Stroke",
                "DiffWalking", "Sex", "PhysicalActivity", "Asthma",
                "KidneyDisease", "SkinCancer")
for (col in yesno_cols) {
  heartdata[[col]] <- as.factor(heartdata[[col]])
}

# Shorten the long race label, then fix a display order for the levels.
heartdata$Race <- gsub("American Indian/Alaskan Native","Native",heartdata$Race)
heartdata$Race <- factor(heartdata$Race,
                         levels = c("White","Hispanic","Black","Asian","Native","Other"))

# Same idea for the diabetic column: shorten one label, order the levels.
heartdata$Diabetic <- gsub("No, borderline diabetes","Borderline",heartdata$Diabetic)
heartdata$Diabetic <- factor(heartdata$Diabetic,
                             levels = c("No","Borderline","Yes (during pregnancy)","Yes"))

# Ordered self-reported health levels, worst to best.
heartdata$GenHealth <- factor(heartdata$GenHealth,
                              levels = c("Poor","Fair","Good","Very good","Excellent"))

# AgeCategory is a 5-year band like "55-59" ("80 or older" mapped to
# "80-84").  Draw an age inside the band: band's upper bound (characters
# 4-5 of the label) minus a uniform random offset in 0..4.
# NOTE(review): no set.seed() here, so Age differs between runs.
heartdata$AgeCategory <- gsub(" or older","-84",as.character(heartdata$AgeCategory))
heartdata$HiAge <- as.numeric(substr(heartdata$AgeCategory,4,5))
heartdata$rand <- sample(0:4, size = nrow(heartdata), replace = TRUE)
heartdata$Age <- with(heartdata, HiAge - rand)
#heartdata <- subset(heartdata,select=-c(AgeCategory,HiAge,rand))

# Convenience subsets used by the per-group plots below.
posdis <- subset(heartdata, HeartDisease == "Yes")
negdis <- subset(heartdata, HeartDisease == "No")

str(heartdata)
# My functions

# Split-violin geom: a ggproto that draws only half of each violin so two
# fill groups can share one x position (odd groups take the left half, even
# groups the right).  Adapted from a widely shared community recipe; it
# relies on ggplot2 internals (ggplot2:::ggname), so it may break across
# ggplot2 versions -- TODO confirm against the installed version.
GeomSplitViolin <- ggproto("GeomSplitViolin", GeomViolin, draw_group = function(self, data, ..., draw_quantiles = NULL){
  # xminv/xmaxv: left and right outlines of the full violin at each y.
  data <- transform(data, xminv = x - violinwidth * (x - xmin), xmaxv = x + violinwidth * (xmax - x))
  grp <- data[1,'group']
  # Keep only one outline per group and order the points so the polygon is
  # traced in a consistent direction.
  newdata <- plyr::arrange(transform(data, x = if(grp%%2==1) xminv else xmaxv), if(grp%%2==1) y else -y)
  # Close the polygon by repeating the first/last points.
  newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ], newdata[1, ])
  # Snap the closing edge to the rounded x of the group centre so the two
  # half-violins meet in a straight vertical line.
  newdata[c(1,nrow(newdata)-1,nrow(newdata)), 'x'] <- round(newdata[1, 'x']) 
  if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) {
    stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <= 
                                              1))
    # Build segments for the requested quantile lines, carrying over the
    # non-positional aesthetics from the first data row.
    quantiles <- create_quantile_segment_frame(data, draw_quantiles)
    aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data), c("x", "y")), drop = FALSE]
    aesthetics$alpha <- rep(1, nrow(quantiles))
    both <- cbind(quantiles, aesthetics)
    quantile_grob <- GeomPath$draw_panel(both, ...)
    ggplot2:::ggname("geom_split_violin", grobTree(GeomPolygon$draw_panel(newdata, ...), quantile_grob))
  }
  else {
    ggplot2:::ggname("geom_split_violin", GeomPolygon$draw_panel(newdata, ...))
  }
})

# Layer constructor for the split-violin geom defined above.  Mirrors the
# signature of ggplot2::geom_violin() but dispatches to GeomSplitViolin.
geom_split_violin <- function (mapping = NULL,
                               data = NULL,
                               stat = "ydensity",
                               position = "identity",
                               ...,
                               draw_quantiles = NULL,
                               trim = TRUE,
                               scale = "area",
                               na.rm = FALSE,
                               show.legend = NA,
                               inherit.aes = TRUE) {
  layer(
    data = data,
    mapping = mapping,
    stat = stat,
    geom = GeomSplitViolin,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = list(
      trim = trim,
      scale = scale,
      draw_quantiles = draw_quantiles,
      na.rm = na.rm,
      ...
    )
  )
}

# Most frequent value in `v`; on a tie the value seen first wins.
getmode <- function(v) {
  vals <- unique(v)
  counts <- tabulate(match(v, vals))
  vals[which.max(counts)]
}
# Histograms of numerical variables.
# Vertical lines: red = mean, blue = median, orange = mode.

ggplot(data = heartdata, aes(x = BMI)) +
  geom_histogram(bins = 50) +
  xlim(0, 75) +
  ggtitle("Distribution of BMI") +
  geom_vline(xintercept = mean(heartdata$BMI, na.rm = TRUE), color = "red", size = 1.5) +
  geom_vline(xintercept = median(heartdata$BMI, na.rm = TRUE), color = "blue", size = 1.5) +
  geom_vline(xintercept = getmode(heartdata$BMI), color = "orange", size = 1.5)

ggplot(data = heartdata, aes(x = PhysicalHealth)) +
  geom_histogram(bins = 30)

ggplot(data = heartdata, aes(x = MentalHealth)) +
  geom_histogram(bins = 30)

ggplot(data = heartdata, aes(x = SleepTime)) +
  geom_histogram(bins = 12) +
  xlim(2, 13) +
  ggtitle("Distribution of Sleep Time") +
  geom_vline(xintercept = mean(heartdata$SleepTime, na.rm = TRUE), color = "red", size = 1.5) +
  geom_vline(xintercept = median(heartdata$SleepTime, na.rm = TRUE), color = "blue", size = 1.5) +
  geom_vline(xintercept = getmode(heartdata$SleepTime), color = "orange", size = 1.5)

ggplot(data = heartdata, aes(x = Age)) +
  geom_histogram(bins = 65) +
  ggtitle("Distribution of Ages") +
  geom_vline(xintercept = mean(heartdata$Age, na.rm = TRUE), color = "red", size = 1.5) +
  geom_vline(xintercept = median(heartdata$Age, na.rm = TRUE), color = "blue", size = 1.5) +
  geom_vline(xintercept = getmode(heartdata$Age), color = "orange", size = 1.5)

# Comparison of factors
# NOTE on indexing: table(a, b) puts `a` in rows and `b` in columns, and
# single-bracket indexing like smo[1] walks the table COLUMN-major:
# [1] = row1/col1, [2] = row2/col1, [3] = row1/col2, [4] = row2/col2.
# Each pair below is therefore a within-column (within-exposure) rate.

print("Heart disease:")
hea<-table(heartdata$HeartDisease)
hea
#No heart disease
hea[1]/(hea[1]+hea[2])
#Heart disease
hea[2]/(hea[1]+hea[2])

print("Smoking and heart disease:")
# rows = HeartDisease (No/Yes), cols = Smoking (No/Yes)
smo<-table(heartdata$HeartDisease,heartdata$Smoking)
smo
#No smoking, no heart disease
smo[1]
smo[1]/(smo[1]+smo[2])
#No smoking, heart disease
smo[2]
smo[2]/(smo[1]+smo[2])
#Smoking, no heart disease
smo[3]
smo[3]/(smo[3]+smo[4])
#Smoking, heart disease
smo[4]
smo[4]/(smo[3]+smo[4])

print("Drinking and heart disease:")
# rows = HeartDisease (No/Yes), cols = AlcoholDrinking (No/Yes)
dri<-table(heartdata$HeartDisease,heartdata$AlcoholDrinking)
dri
#No drinking, no heart disease
dri[1]
dri[1]/(dri[1]+dri[2])
#No drinking, heart disease
dri[2]
dri[2]/(dri[1]+dri[2])
#Drinking, no heart disease
dri[3]
dri[3]/(dri[3]+dri[4])
#Drinking, heart disease
dri[4]
dri[4]/(dri[3]+dri[4])

# The remaining tables flip the orientation: rows = exposure,
# cols = HeartDisease.  Only the raw counts are printed.
print("Diabetes and heart disease")
dia<-table(heartdata$Diabetic,heartdata$HeartDisease)
dia

print("Stroke and heart disease")
stro<-table(heartdata$Stroke,heartdata$HeartDisease)
stro

print("Walking and heart disease")
wal<-table(heartdata$DiffWalking,heartdata$HeartDisease)
wal

print("Physical activity and heart disease")
phy<-table(heartdata$PhysicalActivity,heartdata$HeartDisease)
phy

print("Asthma and heart disease")
ast<-table(heartdata$Asthma,heartdata$HeartDisease)
ast

print("Kidney disease and heart disease")
kid<-table(heartdata$KidneyDisease,heartdata$HeartDisease)
kid

print("Skin cancer and heart disease")
ski<-table(heartdata$SkinCancer,heartdata$HeartDisease)
ski
# Heart disease and Age

ggplot(heartdata, aes(x = HeartDisease, y = Age, fill = HeartDisease)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("Heart Disease by Age") +
  coord_flip()

# Per-age, per-sex counts of heart-disease cases and of all respondents.
# FIX: the original grew nine vectors with c() inside a 1:length() loop
# (O(n^2) appends) and relied on data.frame() recycling the short Age
# vector.  This version counts with vapply and builds the frame explicitly;
# the resulting agedata is identical.
agelist <- sort(unique(heartdata$Age))

# Count rows at a given age/sex, optionally restricted to positive cases.
count_at_age <- function(sex, positive_only) {
  vapply(agelist, function(a) {
    sel <- heartdata$Age == a & heartdata$Sex == sex
    if (positive_only) {
      sel <- sel & heartdata$HeartDisease == "Yes"
    }
    sum(sel, na.rm = TRUE)
  }, numeric(1))
}

PositiveMale   <- count_at_age("Male",   positive_only = TRUE)
PositiveFemale <- count_at_age("Female", positive_only = TRUE)
TotalMale      <- count_at_age("Male",   positive_only = FALSE)
TotalFemale    <- count_at_age("Female", positive_only = FALSE)

agedata <- data.frame(
  Age      = rep(agelist, 2),
  Positive = c(PositiveMale, PositiveFemale),
  Total    = c(TotalMale, TotalFemale),
  Sex      = rep(c("Male", "Female"), each = length(agelist))
)
agedata$Percentage <- 100 * agedata$Positive / agedata$Total
agedata$Sex <- factor(agedata$Sex, levels = c("Male", "Female"))

ggplot(agedata, aes(x = Age, y = Percentage, fill = Sex)) +
  geom_area(position = "identity") +
  scale_fill_manual(values = c("blue", "magenta")) +
  ggtitle("Heart Disease by Age and Sex")

#ggplot(subset(agedata,Sex=="Female"), aes(x=Age,y=Percentage)) + geom_area()
# Heart disease and BMI

ggplot(heartdata, aes(x = HeartDisease, y = BMI, fill = HeartDisease)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("Heart Disease by BMI") +
  ylim(15, 45)

# Heart disease versus gen health

ggplot(posdis, aes(x = GenHealth, fill = GenHealth)) +
  geom_bar() +
  ggtitle("General Health for People With Heart Disease")

ggplot(negdis, aes(x = GenHealth, fill = GenHealth)) +
  geom_bar() +
  ggtitle("General Health for People Without Heart Disease")

ggplot(heartdata, aes(x = GenHealth)) +
  geom_bar(aes(fill = HeartDisease)) +
  ggtitle("General Health for People With and Without Heart Disease")

ggplot(heartdata, aes(GenHealth, Age, fill = HeartDisease)) +
  geom_split_violin() +
  ggtitle("Age and General Health for People With and Without Heart Disease")

# BMI and health

ggplot(heartdata, aes(x = GenHealth, y = BMI, fill = GenHealth)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("General Health by BMI") +
  ylim(15, 45)

# Race

ggplot(heartdata, aes(x = Race)) +
  geom_bar(aes(fill = HeartDisease)) +
  ggtitle("Race of People With and Without Heart Disease")

ggplot(heartdata, aes(Race, Age, fill = HeartDisease)) +
  geom_split_violin() +
  ggtitle("Age and Race for People With and Without Heart Disease")

# Percentage of heart disease within each race.
# rac is a 2x6 table (rows = HeartDisease No/Yes, cols = Race in the factor
# level order White..Other), so the "Yes" row divided by the column totals
# gives the rate per race.
# BUG FIX: the original indexed the table by flat position and used
# rac[11]+rac[10] (Other/No + Native/Yes) as the Native denominator instead
# of rac[9]+rac[10]; name-based indexing eliminates that class of error.
rac <- table(heartdata$HeartDisease, heartdata$Race)
Race <- c("White", "Hispanic", "Black", "Asian", "Native", "Other")
Percentage <- as.numeric(100 * rac["Yes", Race] / colSums(rac)[Race])
racedata <- data.frame(Race, Percentage)
racedata$Race <- factor(racedata$Race,
                        levels = c("White", "Hispanic", "Black", "Asian", "Native", "Other"))

ggplot(racedata, aes(x = Race, y = Percentage, fill = Race)) +
  geom_bar(stat = "identity") +
  ggtitle("Percentage of People With Heart Disease")

# Pie graphs: sample composition on a few categorical columns.

hear <- table(heartdata$HeartDisease)
pie(hear, col = rainbow(length(hear)))

smok <- table(heartdata$Smoking)
pie(smok, col = rainbow(length(smok)))

drin <- table(heartdata$AlcoholDrinking)
pie(drin, col = rainbow(length(drin)))

race <- table(heartdata$Race)
pie(race, col = rainbow(length(race)))

sexx <- table(heartdata$Sex)
pie(sexx, col = rainbow(length(sexx)))

# Mental health: keep only respondents reporting at least one bad
# mental-health day, and bucket sleep hours into three categories
# (<7 too little, 7-9 recommended, >9 too much).

heartdata2 <- heartdata
heartdata2 <- heartdata2[heartdata2$MentalHealth != 0, ]
heartdata2$SleepTime <- ifelse(heartdata2$SleepTime < 7,
                               "Too little",
                               ifelse(heartdata2$SleepTime > 9, "Too much", "Recommended"))
heartdata2$Sleep <- factor(heartdata2$SleepTime,
                           levels = c("Too little", "Recommended", "Too much"))

#+coord_cartesian(ylim = c(0, 12))
ggplot(heartdata2, aes(x = Smoking, y = MentalHealth, fill = Smoking)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("Mental Health for Smokers and Non-Smokers")

ggplot(heartdata2, aes(x = AlcoholDrinking, y = MentalHealth, fill = AlcoholDrinking)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("Mental Health for Drinkers and Non-Drinkers")

ggplot(heartdata2, aes(x = PhysicalActivity, y = MentalHealth, fill = PhysicalActivity)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("Mental Health versus Physical Activity")

ggplot(heartdata2, aes(x = Sleep, y = MentalHealth, fill = Sleep)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("Mental Health versus Sleep")

Testing

###Chi-Square Test Is heart disease data related to gender??

# Chi-square test of independence: heart disease vs sex.
# FIX: use <- for top-level assignment (was =), matching the file's style.
sextable <- table(heartdata$HeartDisease, heartdata$Sex)

ezids::xkabledply(sextable, title="Contingency table for Heart Disease  vs Gender ")
Contingency table for Heart Disease vs Gender
Female Male
No 156571 135851
Yes 11234 16139
# Pearson chi-square on the 2x2 table (Yates correction applied by default).
# FIX: use <- for top-level assignment (was =).
chitest_sex <- chisq.test(sextable)
chitest_sex
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  sextable
## X-squared = 1568, df = 1, p-value <2e-16

Here, the p-value is less than 0.05, so we reject the null hypothesis: heart disease and gender are related to each other. Does the data support that race strongly affects heart disease?

# Chi-square test of independence: race vs heart disease.
# FIX: use <- for top-level assignments (was =) and normalize spacing.
racetable <- xtabs(~ Race + HeartDisease, data = heartdata)
racetable
##           HeartDisease
## Race           No    Yes
##   White    222705  22507
##   Hispanic  26003   1443
##   Black     21210   1729
##   Asian      7802    266
##   Native     4660    542
##   Other     10042    886
chitest_race <- chisq.test(racetable)
chitest_race
## 
##  Pearson's Chi-squared test
## 
## data:  racetable
## X-squared = 844, df = 5, p-value <2e-16

Here, the p-value is less than 0.05, so we reject the null hypothesis: the data supports that race has an effect on heart disease.

Does the data support that heart disease affects general health?

# Chi-square test of independence: heart disease vs general health.
# FIX: use <- for top-level assignments (was =).
gentable <- table(heartdata$HeartDisease, heartdata$GenHealth)
ezids::xkabledply(gentable, title="Contingency table for Heart Disease  vs Gen Health ")
Contingency table for Heart Disease vs Gen Health
Poor Fair Good Very good Excellent
No 7439 27593 83571 108477 65342
Yes 3850 7084 9558 5381 1500
chitest_gen <- chisq.test(gentable)
chitest_gen
## 
##  Pearson's Chi-squared test
## 
## data:  gentable
## X-squared = 21542, df = 4, p-value <2e-16

Here, the p-value is less than 0.05, so we reject the null hypothesis: the data supports that heart disease affects general health. ###T-test What is the average age of people suffering from heart disease?

 heart_disease_on=subset(heartdata,HeartDisease=="Yes")
ttest_age <- t.test(heart_disease_on$Age)
ttest_age
## 
##  One Sample t-test
## 
## data:  heart_disease_on$Age
## t = 934, df = 27372, p-value <2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  68.0 68.3
## sample estimates:
## mean of x 
##      68.2

The average age of people having heart disease is 68. ###Test For Association

What variables affect mental health and physical health? In particular, do alcohol drinking and smoking have an effect?

# Correlation of bad-mental-health days with the smoking indicator.
# as.numeric() on the Yes/No factor yields 1/2 (alphabetical levels), so
# this is a point-biserial-style Pearson correlation.
menhealth_smoke<-cor.test(heartdata$MentalHealth,as.numeric(heartdata$Smoking), method="pearson")
menhealth_smoke
## 
##  Pearson's product-moment correlation
## 
## data:  heartdata$MentalHealth and as.numeric(heartdata$Smoking)
## t = 48, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0817 0.0886
## sample estimates:
##    cor 
## 0.0852
# Same test against the alcohol-drinking indicator.
menhealth_drinking<-cor.test(heartdata$MentalHealth,as.numeric(heartdata$AlcoholDrinking), method="pearson")
menhealth_drinking
## 
##  Pearson's product-moment correlation
## 
## data:  heartdata$MentalHealth and as.numeric(heartdata$AlcoholDrinking)
## t = 29, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0478 0.0547
## sample estimates:
##    cor 
## 0.0513

Smoking and drinking variables do not have strong correlation with mental health.Smoking has more stronger correlation than drinking alcohol with mental health.

# Correlation of bad-physical-health days with the smoking indicator
# (same point-biserial-style construction as the mental-health tests above).
phyhealth_smoke<-cor.test(heartdata$PhysicalHealth,as.numeric(heartdata$Smoking), method="pearson")
phyhealth_smoke
## 
##  Pearson's product-moment correlation
## 
## data:  heartdata$PhysicalHealth and as.numeric(heartdata$Smoking)
## t = 66, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.112 0.119
## sample estimates:
##   cor 
## 0.115
# And against the alcohol-drinking indicator.
phyhealth_drinking<-cor.test(heartdata$PhysicalHealth,as.numeric(heartdata$AlcoholDrinking), method="pearson")
phyhealth_drinking
## 
##  Pearson's product-moment correlation
## 
## data:  heartdata$PhysicalHealth and as.numeric(heartdata$AlcoholDrinking)
## t = -10, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0207 -0.0138
## sample estimates:
##     cor 
## -0.0173

Smoking and drinking variables do not have strong correlation with physical health.Smoking has more stronger correlation than drinking alcohol with physical health where drinking alcohol is negatively correlated.

Model building

SMART Question

What variables affect instances of heart disease?

Pre-processing and balancing the data

Our goal is to find out the people who are likely to have heart disease in the future, so we can take some actions like a more detailed physical examination before the conditions become worse.

# Re-load the raw data for modelling.  Unlike the EDA copy above, this keeps
# the original race labels and encodes GenHealth as an ordinal number.
heartdata2 <- data.frame(read.csv("heart_2020_cleaned.csv"))

n <- 10

# Plain categorical columns: default factor conversion (replaces ten
# copy-pasted as.factor() lines).
factor_cols <- c("Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
                 "Sex", "Race", "PhysicalActivity", "Asthma",
                 "KidneyDisease", "SkinCancer")
for (col in factor_cols) {
  heartdata2[[col]] <- as.factor(heartdata2[[col]])
}

# Fold the rare pregnancy-only category into plain "Yes" before factoring.
heartdata2[heartdata2$Diabetic == 'Yes (during pregnancy)', "Diabetic"] <- 'Yes'
heartdata2$Diabetic <- as.factor(heartdata2$Diabetic)

# Ordinal encoding of self-reported health: Poor=0 .. Excellent=4.
# (Replaces five element-wise string-replacement statements.)
genhealth_score <- c("Poor" = 0, "Fair" = 1, "Good" = 2,
                     "Very good" = 3, "Excellent" = 4)
heartdata2$GenHealth <- as.numeric(genhealth_score[heartdata2$GenHealth])

# Convert the 5-year AgeCategory band to a number: band's lower bound
# (first two characters) plus a uniform random offset in 0..4
# ("80 or older" becomes "80-4" so it parses the same way).
# NOTE(review): no set.seed() here, so Age differs between runs.
heartdata2$AgeCategory <- gsub(" or older", "-4", as.character(heartdata2$AgeCategory))
heartdata2$LoAge <- as.numeric(substr(heartdata2$AgeCategory, 1, 2))
heartdata2$rand <- sample(0:4, size = nrow(heartdata2), replace = TRUE)
heartdata2$Age <- with(heartdata2, LoAge + rand)
heartdata2 <- subset(heartdata2, select = -c(AgeCategory, LoAge, rand))

# Move the response to the last column, renamed y (the layout bestglm's
# Xy argument expects), and drop the original column.
heartdata2$y <- as.factor(heartdata2$HeartDisease)
heartdata2 <- subset(heartdata2, select = -c(HeartDisease))

str(heartdata2)

At first, because we need to do the feature selection later, I put the HeartDisease column in the end of the dataset and rename it into y.

To get a look of the proportion of heart disease data before we continue our research.

# Class balance before resampling: roughly ten "No" for every "Yes".
table( heartdata2$y )
## 
##     No    Yes 
## 292422  27373

The dataset is very unbalanced. Considering that the dataset is large, I use undersampling to balance it. Reference: https://www.analyticsvidhya.com/blog/2016/03/practical-guide-deal-imbalanced-classification-problems/

loadPkg("ROSE")
data_balanced_under <- ovun.sample(y ~ ., data = heartdata2, method = "under", N = 27373*2, seed = 1)$data
rm(heartdata2)
unloadPkg("ROSE") 

We can find that the data is really balanced.

# Confirm the 50/50 class balance after undersampling.
table( data_balanced_under$y )
## 
##    No   Yes 
## 27373 27373

And also the structure of the dataset.

# Column types after preprocessing: factors for the binary/categorical
# columns, numeric GenHealth (0-4), and the response y in the last slot.
str(data_balanced_under)
## 'data.frame':    54746 obs. of  18 variables:
##  $ BMI             : num  30.9 21.9 25.8 25.8 30.1 ...
##  $ Smoking         : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
##  $ AlcoholDrinking : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ Stroke          : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PhysicalHealth  : num  0 0 1 5 1 0 0 0 3 0 ...
##  $ MentalHealth    : num  0 0 0 0 0 0 0 3 10 0 ...
##  $ DiffWalking     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Sex             : Factor w/ 2 levels "Female","Male": 2 2 1 2 1 1 2 1 1 1 ...
##  $ Race            : Factor w/ 6 levels "American Indian/Alaskan Native",..: 6 6 6 4 6 6 6 6 6 6 ...
##  $ Diabetic        : Factor w/ 3 levels "No","No, borderline diabetes",..: 1 1 1 1 1 1 1 1 3 1 ...
##  $ PhysicalActivity: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ GenHealth       : num  4 4 3 4 3 3 2 3 2 3 ...
##  $ SleepTime       : num  8 6 8 6 6 8 5 8 6 8 ...
##  $ Asthma          : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ KidneyDisease   : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ SkinCancer      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
##  $ Age             : num  55 25 81 32 59 66 27 34 63 64 ...
##  $ y               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...

logistic regression model

To train and evaluate the model, we will split the dataset into two parts. 80 percent of the dataset will be used to train the model and the rest will be used to test the accuracy of the model.

loadPkg("caret")
set.seed(4321)
test <- createDataPartition( data_balanced_under$y, p = .2, list = FALSE )
data_train <- data_balanced_under[ -test, ]
data_test  <- data_balanced_under[ test, ]
unloadPkg("caret") 
model_glm <- glm( y ~ ., data = data_train, family = binomial(logit) )
summary_glm <- summary(model_glm)
summary_glm
## 
## Call:
## glm(formula = y ~ ., family = binomial(logit), data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0217  -0.7842  -0.0249   0.8147   2.9765  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -3.375878   0.144134  -23.42  < 2e-16 ***
## BMI                              0.011718   0.001978    5.92  3.2e-09 ***
## SmokingYes                       0.390957   0.024277   16.10  < 2e-16 ***
## AlcoholDrinkingYes              -0.171383   0.053407   -3.21  0.00133 ** 
## StrokeYes                        1.194871   0.050744   23.55  < 2e-16 ***
## PhysicalHealth                   0.003983   0.001548    2.57  0.01008 *  
## MentalHealth                     0.005539   0.001582    3.50  0.00046 ***
## DiffWalkingYes                   0.213718   0.033258    6.43  1.3e-10 ***
## SexMale                          0.734255   0.024608   29.84  < 2e-16 ***
## RaceAsian                       -0.482339   0.133234   -3.62  0.00029 ***
## RaceBlack                       -0.370447   0.101298   -3.66  0.00026 ***
## RaceHispanic                    -0.209198   0.102247   -2.05  0.04076 *  
## RaceOther                       -0.173323   0.112167   -1.55  0.12229    
## RaceWhite                       -0.195416   0.091551   -2.13  0.03280 *  
## DiabeticNo, borderline diabetes  0.198445   0.074399    2.67  0.00765 ** 
## DiabeticYes                      0.486828   0.030465   15.98  < 2e-16 ***
## PhysicalActivityYes             -0.032212   0.028335   -1.14  0.25561    
## GenHealth                       -0.503055   0.014083  -35.72  < 2e-16 ***
## SleepTime                       -0.030295   0.007678   -3.95  8.0e-05 ***
## AsthmaYes                        0.298173   0.034461    8.65  < 2e-16 ***
## KidneyDiseaseYes                 0.663836   0.052098   12.74  < 2e-16 ***
## SkinCancerYes                    0.145508   0.035878    4.06  5.0e-05 ***
## Age                              0.058546   0.000947   61.80  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 60714  on 43795  degrees of freedom
## Residual deviance: 43318  on 43773  degrees of freedom
## AIC: 43364
## 
## Number of Fisher Scoring iterations: 5

We can find from the model that the p-value of Race and PhysicalActivity are larger than 0.05, which means they are insignificant. So just drop these two variables and make a logistic regression model again.

# Second model: drop the two predictors found insignificant above
# (Race, PhysicalActivity); everything else unchanged.
model2_glm <- glm( y ~ . - Race - PhysicalActivity, data = data_train, family = binomial(logit) )
summary2_glm <- summary(model2_glm)
summary2_glm
## 
## Call:
## glm(formula = y ~ . - Race - PhysicalActivity, family = binomial(logit), 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0166  -0.7839  -0.0258   0.8159   2.9828  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -3.647232   0.110721  -32.94  < 2e-16 ***
## BMI                              0.011943   0.001965    6.08  1.2e-09 ***
## SmokingYes                       0.400204   0.024130   16.59  < 2e-16 ***
## AlcoholDrinkingYes              -0.165016   0.053373   -3.09  0.00199 ** 
## StrokeYes                        1.189634   0.050649   23.49  < 2e-16 ***
## PhysicalHealth                   0.004317   0.001542    2.80  0.00511 ** 
## MentalHealth                     0.005698   0.001580    3.61  0.00031 ***
## DiffWalkingYes                   0.216886   0.032854    6.60  4.1e-11 ***
## SexMale                          0.735589   0.024534   29.98  < 2e-16 ***
## DiabeticNo, borderline diabetes  0.189406   0.074246    2.55  0.01074 *  
## DiabeticYes                      0.482124   0.030343   15.89  < 2e-16 ***
## GenHealth                       -0.502807   0.013947  -36.05  < 2e-16 ***
## SleepTime                       -0.029368   0.007669   -3.83  0.00013 ***
## AsthmaYes                        0.298754   0.034427    8.68  < 2e-16 ***
## KidneyDiseaseYes                 0.664202   0.052078   12.75  < 2e-16 ***
## SkinCancerYes                    0.156204   0.035617    4.39  1.2e-05 ***
## Age                              0.058824   0.000931   63.22  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 60714  on 43795  degrees of freedom
## Residual deviance: 43346  on 43779  degrees of freedom
## AIC: 43380
## 
## Number of Fisher Scoring iterations: 5

We’ll quickly check two things for this model. First the p-values. Values below .05 indicates significance, which means the coefficient or so called parameters that are estimated by our model are reliable. And second, the pseudo R square. This value ranging from 0 to 1 indicates how much variance is explained by our model.

All the p-values of the models indicates significance, meaning that our model is a legitimate one. But R square of 0.29 tells that 29 percent of the variance is explained.

# Coefficients plus McFadden pseudo R-squared (1 - residual/null deviance).
# FIX: the component of a summary.glm object is "coefficients"; the original
# "$coefficient" only worked via $ partial matching, which is fragile.
list( summary2_glm$coefficients, 
      round( 1 - ( summary2_glm$deviance / summary2_glm$null.deviance ), 2 ) )

Have a look of the vif value.

# Variance inflation factors for model 2 (multicollinearity check).
# FIX: use <- for top-level assignment (was =).
vif_md2 <- faraway::vif(model2_glm)
vif_md2
##                             BMI                      SmokingYes 
##                            7.11                            6.37 
##              AlcoholDrinkingYes                       StrokeYes 
##                            6.52                            9.62 
##                  PhysicalHealth                    MentalHealth 
##                           10.42                            8.05 
##                  DiffWalkingYes                         SexMale 
##                            8.70                            6.57 
## DiabeticNo, borderline diabetes                     DiabeticYes 
##                            5.59                            7.09 
##                       GenHealth                       SleepTime 
##                           11.08                            6.67 
##                       AsthmaYes                KidneyDiseaseYes 
##                            6.79                            8.44 
##                   SkinCancerYes                             Age 
##                            6.40                           11.13

We can see that some VIF values are larger than 10, which means those variables are highly correlated — not acceptable. So I dropped one variable at a time, meanwhile checking the p-values to ensure the remaining variables stayed significant. In the end, we arrived at the model below:

# Third model: additionally drop the high-VIF variables found above
# (Age, Asthma, PhysicalHealth), leaving significant, lower-collinearity
# predictors only.
model3_glm <- glm( y ~ . - Race - PhysicalActivity - Age - Asthma - PhysicalHealth, data = data_train, family = binomial(logit) )
summary3_glm <- summary(model3_glm)
summary3_glm
## 
## Call:
## glm(formula = y ~ . - Race - PhysicalActivity - Age - Asthma - 
##     PhysicalHealth, family = binomial(logit), data = data_train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.370  -0.858  -0.120   0.941   2.297  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      0.21961    0.08471    2.59  0.00953 ** 
## BMI                             -0.00625    0.00181   -3.45  0.00055 ***
## SmokingYes                       0.48167    0.02257   21.34  < 2e-16 ***
## AlcoholDrinkingYes              -0.40284    0.04980   -8.09  6.0e-16 ***
## StrokeYes                        1.36603    0.04933   27.69  < 2e-16 ***
## MentalHealth                    -0.01653    0.00141  -11.74  < 2e-16 ***
## DiffWalkingYes                   0.62529    0.03037   20.59  < 2e-16 ***
## SexMale                          0.59018    0.02277   25.91  < 2e-16 ***
## DiabeticNo, borderline diabetes  0.42327    0.07167    5.91  3.5e-09 ***
## DiabeticYes                      0.71741    0.02907   24.68  < 2e-16 ***
## GenHealth                       -0.57790    0.01223  -47.24  < 2e-16 ***
## SleepTime                        0.03334    0.00728    4.58  4.7e-06 ***
## KidneyDiseaseYes                 0.85136    0.05099   16.70  < 2e-16 ***
## SkinCancerYes                    0.74038    0.03395   21.81  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 60714  on 43795  degrees of freedom
## Residual deviance: 48335  on 43782  degrees of freedom
## AIC: 48363
## 
## Number of Fisher Scoring iterations: 4
# VIFs for the reduced model -- all below the common threshold of 10 now.
# FIX: use <- for top-level assignment (was =).
vif_md3 <- faraway::vif(model3_glm)
vif_md3
##                             BMI                      SmokingYes 
##                            6.03                            5.58 
##              AlcoholDrinkingYes                       StrokeYes 
##                            5.68                            9.12 
##                    MentalHealth                  DiffWalkingYes 
##                            6.39                            7.44 
##                         SexMale DiabeticNo, borderline diabetes 
##                            5.66                            5.21 
##                     DiabeticYes                       GenHealth 
##                            6.51                            8.53 
##                       SleepTime                KidneyDiseaseYes 
##                            6.02                            8.09 
##                   SkinCancerYes 
##                            5.81

Now, the vif values are all below 10.

Feature selection

In this part, I use feature selection to find the suitable variables for our current model. Because the data_train dataset is large and takes a long time to run, I switched to data_test for the feature selection.

# Feature selection: backward search with bestglm over the already-reduced
# predictor set.  NOTE: run on data_test (the smaller split) purely for
# speed -- data_train takes too long with this routine.
# FIX: use <- for top-level assignment (was =) and complete the truncated
# inline comment.
data_feature_selection <- subset(data_test,
                                 select = -c(Race, PhysicalActivity, PhysicalHealth, Age, Asthma))
str(data_feature_selection)
loadPkg("bestglm")
res.bestglm <- bestglm(Xy = data_feature_selection, family = binomial,
            IC = "AIC",                 # information criterion to minimize
            method = "backward")
summary(res.bestglm)
## Fitting algorithm:  AIC-glm
## Best Model:
##               df deviance
## Null Model 10936    12036
## Full Model 10949    15180
## 
##  likelihood-ratio test - GLM
## 
## data:  H0: Null Model vs. H1: Best Fit AIC-glm
## X = 3144, df = 13, p-value <2e-16
# Top candidate models ranked by AIC (TRUE = variable included).
res.bestglm$BestModels
##     BMI Smoking AlcoholDrinking Stroke MentalHealth DiffWalking  Sex Diabetic
## 1  TRUE    TRUE            TRUE   TRUE         TRUE        TRUE TRUE     TRUE
## 2  TRUE    TRUE            TRUE   TRUE         TRUE        TRUE TRUE     TRUE
## 3  TRUE    TRUE           FALSE   TRUE         TRUE        TRUE TRUE     TRUE
## 4  TRUE    TRUE           FALSE   TRUE         TRUE        TRUE TRUE     TRUE
## 5 FALSE    TRUE            TRUE   TRUE         TRUE        TRUE TRUE     TRUE
##   GenHealth SleepTime KidneyDisease SkinCancer Criterion
## 1      TRUE      TRUE          TRUE       TRUE     12062
## 2      TRUE     FALSE          TRUE       TRUE     12064
## 3      TRUE      TRUE          TRUE       TRUE     12067
## 4      TRUE     FALSE          TRUE       TRUE     12070
## 5      TRUE      TRUE          TRUE       TRUE     12070
# Summary of how often each variable appears across the top five models.
summary(res.bestglm$BestModels)
##     BMI          Smoking        AlcoholDrinking  Stroke        MentalHealth  
##  Mode :logical   Mode:logical   Mode :logical   Mode:logical   Mode:logical  
##  FALSE:1         TRUE:5         FALSE:2         TRUE:5         TRUE:5        
##  TRUE :4                        TRUE :3                                      
##                                                                              
##                                                                              
##                                                                              
##  DiffWalking      Sex          Diabetic       GenHealth      SleepTime      
##  Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode :logical  
##  TRUE:5         TRUE:5         TRUE:5         TRUE:5         FALSE:2        
##                                                              TRUE :3        
##                                                                             
##                                                                             
##                                                                             
##  KidneyDisease  SkinCancer       Criterion    
##  Mode:logical   Mode:logical   Min.   :12062  
##  TRUE:5         TRUE:5         1st Qu.:12064  
##                                Median :12067  
##                                Mean   :12067  
##                                3rd Qu.:12070  
##                                Max.   :12070
# bestglm attaches leaps; unload both when done.
unloadPkg("bestglm") 
unloadPkg("leaps") 

We can see from the feature selection that the best model includes all 13 of these variables, and its AIC (Akaike Information Criterion) is 12062, the lowest among the candidate models.

Model Evaluation

ROC and AUC

The Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) measure the true positive rate (sensitivity) against the false positive rate (1 − specificity). The area under the curve is always between 0.5 and 1, and values higher than 0.8 are considered a good model fit.

# pROC: receiver operating characteristic tools. The ROC curve plots
# sensitivity (recall / true-positive rate) against the false-positive
# rate (fall-out) as the discrimination threshold is varied.
loadPkg("pROC")
# In-sample predicted probabilities from the fitted logistic model
prob <- predict(model3_glm, type = "response")
data_train$prob <- prob
roc_obj <- roc(y ~ prob, data = data_train)
auc(roc_obj)  # area under the curve; 0.8 or higher is preferred
plot(roc_obj)

# unloadPkg("pROC")

The AUC of the model is 0.795, slightly below 0.8. Because the model appears suitable and includes all the needed features, we suspect the data itself is the reason the AUC is not higher.

Confusion matrix

We can have a look at the confusion matrix.

# install.packages("regclass")
library("regclass")
# confusion_matrix(admitLogit)
# Confusion matrix of the fitted logistic model on the training data;
# xkabledply renders it as a formatted table
xkabledply( confusion_matrix(model3_glm), title = "Confusion matrix from Logit Model" )
Confusion matrix from Logit Model
Predicted No Predicted Yes Total
Actual No 16646 5252 21898
Actual Yes 6957 14941 21898
Total 23603 20193 43796
unloadPkg("regclass")

We can see from the confusion matrix that the precision is 14941/(5252+14941) = 0.74, which measures how valid the positive predictions are. The recall is 14941/(6957+14941) = 0.682, which measures how complete the results are. For our model, recall is actually the more important metric, because a false negative means a heart disease patient is missed by our model, which could have harmful consequences.

loadPkg("InformationValue")
# Out-of-sample predicted probabilities on the held-out test set
predicted = predict(model3_glm, data_test, type = "response")

# Test-set confusion matrix (rows = actual class, columns = predicted No/Yes)
# NOTE(review): assumes data_test$y holds the observed 0/1 outcome -- confirm upstream
confusionMatrix(data_test$y, predicted)
##     No  Yes
## 0 4220 1753
## 1 1255 3722
unloadPkg("InformationValue")

Then we can use the test dataset to check whether the model generalizes well. We used data_test to make predictions and computed the confusion matrix on the test dataset. The precision is 3722/(1255+3722) = 0.748 and the recall is 3722/(1753+3722) = 0.68. The precision and recall on the test dataset are similar to those on the training dataset, which suggests our model is reliable for predicting heart disease.

Interpretation and Reporting

We’ll return to our logistic regression model for a minute and look at the estimated parameters (coefficients). Since the model’s parameters are recorded on the logit (log-odds) scale, we’ll transform them into odds ratios so that they are easier to interpret.

loadPkg("broom")

# Tidy the fitted model: keep the term names, log-odds estimates, and z-statistics
coefficient <- tidy(model3_glm)[ , c( "term", "estimate", "statistic" ) ]

# transform the coefficients from log-odds (logit) scale to odds ratios
coefficient$estimate <- exp( coefficient$estimate )
# Display the terms ordered by descending |odds ratio|
coefficient[sort(abs(coefficient$estimate),decreasing=T,index.return=T)[[2]],]
## # A tibble: 14 × 3
##    term                            estimate statistic
##    <chr>                              <dbl>     <dbl>
##  1 StrokeYes                          3.92      27.7 
##  2 KidneyDiseaseYes                   2.34      16.7 
##  3 SkinCancerYes                      2.10      21.8 
##  4 DiabeticYes                        2.05      24.7 
##  5 DiffWalkingYes                     1.87      20.6 
##  6 SexMale                            1.80      25.9 
##  7 SmokingYes                         1.62      21.3 
##  8 DiabeticNo, borderline diabetes    1.53       5.91
##  9 (Intercept)                        1.25       2.59
## 10 SleepTime                          1.03       4.58
## 11 BMI                                0.994     -3.45
## 12 MentalHealth                       0.984    -11.7 
## 13 AlcoholDrinkingYes                 0.668     -8.09
## 14 GenHealth                          0.561    -47.2
unloadPkg("broom")

We can see from the table that other diseases (stroke, kidney disease, diabetes, skin cancer), general health condition, sex, DiffWalking (serious difficulty walking or climbing stairs), and smoking habits all strongly influence the probability of heart disease. It is a little surprising that drinking alcohol appears to reduce the probability of heart disease. Since drinking is generally considered an unhealthy habit, one possible explanation is that the data also includes light or moderate drinkers.

Classification and Regression Trees

# Work on a copy of heartdata under a new name
library(dplyr)
clean_heart <- heartdata
# Flag respondents reporting poor mental health on 15+ days of the month
# NOTE(review): 15+ bad days maps to "No" -- labelling looks inverted, confirm intent
HighMH <- ifelse(clean_heart$MentalHealth >= 15, "No", "Yes")
clean_heart <- data.frame(clean_heart, HighMH)
# Keep only the columns used for prediction, encoded as factors
clean_heart <- clean_heart %>%
  select(HighMH, Smoking, Sex, AlcoholDrinking, PhysicalActivity) %>%
  mutate(HighMH = factor(HighMH), Smoking = factor(Smoking),
         Sex = factor(Sex), PhysicalActivity = factor(PhysicalActivity))
library('rpart.plot')
# Deterministically split a data frame into train/test partitions by row
# position: the first floor(size * nrow) rows form the training set, the
# remainder the test set.
# NOTE(review): rows are taken in file order, not sampled -- shuffle the
# data beforehand if a random split is intended.
#
# data  : data frame to split
# size  : fraction of rows assigned to the training partition (default 0.8)
# train : TRUE returns the training rows, FALSE the test rows
create_train_test <- function(data, size = 0.8, train = TRUE) {
  n_row <- nrow(data)
  total_row <- floor(size * n_row)
  # seq_len() is safe when total_row == 0; the original 1:total_row
  # produced c(1, 0) there and wrongly dropped row 1 from the test set
  train_sample <- seq_len(total_row)
  idx <- if (train) train_sample else setdiff(seq_len(n_row), train_sample)
  data[idx, , drop = FALSE]
}
#Splitting clean_heart into training and testing data
library(caTools)
set.seed(123)

# NOTE(review): create_train_test splits by row position, so set.seed()
# has no effect here -- the split is deterministic, not random.
data_train <- create_train_test(clean_heart, 0.8, train = TRUE)
data_test <- create_train_test(clean_heart, 0.8, train = FALSE)
dim(data_train)   # sanity check: first 80% of rows
dim(data_test)    # sanity check: remaining 20% of rows

# Earlier attempt at a random split via caTools, kept for reference
#sample = sample.split(clean_heart$MentalHealth, SplitRatio = .70)
#train = subset(clean_heart, sample==TRUE)
#test = subset(clean_heart, sample==FALSE)
#Training the Decision Tree Classifier
# NOTE(review): minsplit = 1 / minbucket = 1 with a small cp permits very
# deep trees and is prone to overfitting -- confirm this is intended.
tree <- rpart(Smoking ~., data=data_train, method="class", control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.001))
print(tree)
summary(tree)
library(rpart)                      # Popular decision tree algorithm
library(rattle)                 # Fancy tree plot
library(rpart.plot)             # Enhanced tree plots
library(RColorBrewer)               # Color selection for fancy tree plot
library(party)                  # Alternative decision tree algorithm
library(partykit)               # Convert rpart object to BinaryTree
library(caret)  
# Base-graphics plot of the fitted tree with node labels
plot(tree, uniform=TRUE, main="Classification Tree for Smoking")
text(tree, use.n=TRUE, all=TRUE, cex=.8)

# NOTE(review): `col` is defined but never passed to any plotting call below
col <- c("#FD8D3C", "#FD8D3C", "#FD8D3C", "#BCBDDC",
         "#FDD0A2", "#FD8D3C", "#BCBDDC")
# Enhanced tree plot: extra=104 shows class probabilities and observation %
prp(tree, type=2, extra=104, nn=TRUE, ni=TRUE, fallen.leaves=TRUE, 
    faclen=0, varlen=0, shadow.col="grey", branch.lty=3)

#we can also change extra=104
library("RColorBrewer")
fancyRpartPlot(tree, main="Classification Tree for Smoking", palettes="PuRd", type=2)

Classification and Regression Trees II

#Creating new heartdata DF with different name
library(dplyr)
clean_heart1<-heartdata
clean_heart1$SleepTime=as.numeric(clean_heart1$SleepTime)
clean_heart1$Age=as.numeric(clean_heart1$Age)
#Flag whether the person sleeps 7 or more hours per night
# NOTE(review): 7+ hours maps to "No" -- labelling looks inverted, confirm intent
AvSleep = ifelse(clean_heart1$SleepTime>=7, "No", "Yes")
clean_heart1 = data.frame(clean_heart1, AvSleep)
#Flag whether the person is aged 30 or older
# NOTE(review): age >= 30 maps to "No" -- labelling looks inverted, confirm intent
Age30Plus = ifelse(clean_heart1$Age>=30, "No", "Yes")
clean_heart1 = data.frame(clean_heart1, Age30Plus)
#Selecting only the meaningful columns for prediction
clean_heart1 <- select(clean_heart1, Smoking, Age30Plus, AvSleep, Race, HeartDisease, PhysicalActivity)
# Encode categorical predictors as factors (Age30Plus/AvSleep stay character here)
clean_heart1 <- mutate(clean_heart1, Race=factor(Race), PhysicalActivity=factor(PhysicalActivity), Smoking=factor(Smoking))
library('rpart.plot')
# Deterministically split a data frame into train/test partitions by row
# position (same contract as create_train_test, kept for the second tree):
# the first floor(size * nrow) rows form the training set, the rest the
# test set.
# NOTE(review): rows are taken in file order, not sampled -- shuffle the
# data beforehand if a random split is intended.
#
# data  : data frame to split
# size  : fraction of rows assigned to the training partition (default 0.8)
# train : TRUE returns the training rows, FALSE the test rows
create_train_test1 <- function(data, size = 0.8, train = TRUE) {
  n_row <- nrow(data)
  total_row <- floor(size * n_row)
  # seq_len() is safe when total_row == 0; the original 1:total_row
  # produced c(1, 0) there and wrongly dropped row 1 from the test set
  train_sample <- seq_len(total_row)
  idx <- if (train) train_sample else setdiff(seq_len(n_row), train_sample)
  data[idx, , drop = FALSE]
}
#Splitting clean_heart into training and testing data
library(caTools)
set.seed(123)

# NOTE(review): create_train_test1 splits by row position; set.seed() has
# no effect on this deterministic split.
data_train1 <- create_train_test1(clean_heart1, 0.8, train = TRUE)
data_test1 <- create_train_test1(clean_heart1, 0.8, train = FALSE)
dim(data_train1)   # sanity check: first 80% of rows
dim(data_test1)    # sanity check: remaining 20% of rows

# Earlier attempt at a random split via caTools, kept for reference
#sample = sample.split(clean_heart$MentalHealth, SplitRatio = .70)
#train = subset(clean_heart, sample==TRUE)
#test = subset(clean_heart, sample==FALSE)
#Training the Decision Tree Classifier
# NOTE(review): minsplit = 1 / minbucket = 1 with a small cp permits very
# deep trees and is prone to overfitting -- confirm this is intended.
tree1 <- rpart(Smoking ~., data=data_train1, method="class", control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.001))
print(tree1)
summary(tree1)
library(rpart)                      # Popular decision tree algorithm
library(rattle)                 # Fancy tree plot
library(rpart.plot)             # Enhanced tree plots
library(RColorBrewer)               # Color selection for fancy tree plot
library(party)                  # Alternative decision tree algorithm
library(partykit)               # Convert rpart object to BinaryTree
library(caret)  
# Base-graphics plot of the fitted tree with node labels
# NOTE(review): the title says "Age" but the tree predicts Smoking
plot(tree1, uniform=TRUE, main="Classification Tree for Age")
text(tree1, use.n=TRUE, all=TRUE, cex=.8)

# NOTE(review): `col` is defined but never passed to any plotting call below
col <- c("#FD8D3C", "#FD8D3C", "#FD8D3C", "#BCBDDC",
         "#FDD0A2", "#FD8D3C", "#BCBDDC")
# Enhanced tree plot: extra=104 shows class probabilities and observation %
prp(tree1, type=2, extra=104, nn=TRUE, ni=TRUE, fallen.leaves=TRUE, 
    faclen=0, varlen=0, shadow.col="grey", branch.lty=3)

#we can also change extra=104
library("RColorBrewer")
fancyRpartPlot(tree1, main="Classification Tree for Smoking", palettes="PuRd", type=2)

# Derive a set of decision rules from the tree to see the probability per rule
rpart.rules(tree1)
#asRules(tree)